{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 学习离散概率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reference: https://plot.ly/python/discrete-frequency/\n",
    "\n",
    "import plotly as py\n",
    "import plotly.graph_objs as go\n",
    "import plotly.figure_factory as ff\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import warnings\n",
    "# Ignore warnings whose message starts with the NumPy 'size changed' texts below.\n",
    "warnings.filterwarnings(\"ignore\", message=\"numpy.dtype size changed\")\n",
    "warnings.filterwarnings(\"ignore\", message=\"numpy.ufunc size changed\")\n",
    "\n",
    "# Initialise plotly's offline notebook mode before the iplot calls in later cells.\n",
    "py.offline.init_notebook_mode(connected=True)\n",
    "\n",
    "# This notebook mainly exercises the histnorm parameter: 'density|probability|percent|'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Control the size of the bins so that the Histogram reveals the distribution progressively"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据导入"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the alcohol-consumption CSV that sits next to the notebook.\n",
    "df_data = pd.read_csv('./2010_alcohol_consumption_by_country.csv')\n",
    "nd_data = df_data.values  # NumPy array of the frame's values (no later cell reads it)\n",
    "\n",
    "# Mid-cell expression: evaluated but not echoed, only the cell's last expression is shown.\n",
    "(type(df_data), type(df_data.values))\n",
    "\n",
    "# Render the first ten rows as a plotly table.\n",
    "fig = ff.create_table(df_data.iloc[:10])\n",
    "py.offline.iplot(fig)\n",
    "\n",
    "# Keep the 'alcohol' column; the histogram and cumulative cells below read it.\n",
    "alcohol_ss_data = df_data['alcohol']\n",
    "\n",
    "type(alcohol_ss_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 频数密度展示（histnorm='density'）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Histogram of the alcohol column with histnorm='density' and bins of width 0.5\n",
    "# spanning the observed minimum to the observed maximum.\n",
    "trace_hist = go.Histogram(\n",
    "    x=alcohol_ss_data,\n",
    "    xbins={\n",
    "        'start': alcohol_ss_data.min(),\n",
    "        'end': alcohol_ss_data.max(),\n",
    "        'size': 0.5,\n",
    "    },\n",
    "    histnorm='density',\n",
    "    name='trace-name-t1',\n",
    "    marker={'color': '#0000FF'},\n",
    ")\n",
    "\n",
    "# `trace_hist` and `layout` are reused by the probability and percent cells below.\n",
    "layout = go.Layout(title='layout-title-density')\n",
    "\n",
    "fig = go.Figure(data=[trace_hist], layout=layout)\n",
    "py.offline.iplot(fig)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 概率分布"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a fresh trace from `trace_hist` with only histnorm overridden, instead of\n",
    "# mutating the shared trace in place: the state left in the kernel then no longer\n",
    "# depends on which histogram cell happened to run last.\n",
    "trace_hist_probability = go.Histogram(trace_hist, histnorm='probability')\n",
    "\n",
    "fig = go.Figure(data=[trace_hist_probability], layout=layout)\n",
    "# Override the title on this figure only; the shared `layout` object is not reassigned.\n",
    "fig.layout.title = 'layout-title-probability'\n",
    "py.offline.iplot(fig)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 百分比展示"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a fresh trace from `trace_hist` with only histnorm overridden, instead of\n",
    "# mutating the shared trace in place: the state left in the kernel then no longer\n",
    "# depends on which histogram cell happened to run last.\n",
    "trace_hist_percent = go.Histogram(trace_hist, histnorm='percent')\n",
    "\n",
    "fig = go.Figure(data=[trace_hist_percent], layout=layout)\n",
    "# Override the title on this figure only; the shared `layout` object is not reassigned.\n",
    "fig.layout.title = 'layout-title-percent'\n",
    "py.offline.iplot(fig)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 累积分布\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Running total of the alcohol column, in the original (unsorted) row order.\n",
    "alcohol_ss_cum = alcohol_ss_data.cumsum()\n",
    "print(alcohol_ss_data.min(), alcohol_ss_data.max(), alcohol_ss_data.size, alcohol_ss_cum.size)\n",
    "print(alcohol_ss_data.head(5))\n",
    "print(\"\\n\")\n",
    "print(alcohol_ss_cum.head(5))\n",
    "\n",
    "# NOTE(review): this example looks questionable -- it plots the running share of the\n",
    "# total against row position, which depends on row order and is not an empirical CDF.\n",
    "trace = go.Scatter(\n",
    "    x=list(range(len(alcohol_ss_cum))),\n",
    "    y=alcohol_ss_cum / alcohol_ss_cum.iloc[-1],\n",
    "    marker={'color': '#0000FF'},\n",
    ")\n",
    "\n",
    "fig = go.Figure(data=[trace], layout=layout)\n",
    "fig.layout.title = \"CDF\"\n",
    "py.offline.iplot(fig)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Empirical CDF of alcohol consumption: for each observed value x, the fraction of\n",
    "# observations that are <= x. NaNs are dropped first so they cannot poison the curve.\n",
    "alcohol_ss_sort_data = alcohol_ss_data.dropna().sort_values()\n",
    "print(alcohol_ss_sort_data.tail(10))\n",
    "\n",
    "# Running total of the sorted values, kept because earlier versions of this cell defined it.\n",
    "# Dividing it by the grand total gives a cumulative *share* of consumption, not a CDF,\n",
    "# so it is no longer used for the plot.\n",
    "alcohol_ss_sort_cum = np.cumsum(alcohol_ss_sort_data)\n",
    "\n",
    "# The i-th smallest observation (1-based) has ECDF value i / n.\n",
    "n_obs = alcohol_ss_sort_data.size\n",
    "alcohol_ss_ecdf = np.arange(1, n_obs + 1) / n_obs\n",
    "\n",
    "trace2 = go.Scatter(\n",
    "    x = alcohol_ss_sort_data.tolist(),\n",
    "    y = alcohol_ss_ecdf.tolist(),\n",
    "    # An ECDF is a step function: hold the value until the next observation.\n",
    "    line = dict(\n",
    "        shape = 'hv',\n",
    "    ),\n",
    "    marker = dict(\n",
    "        color = '#0000FF',\n",
    "    )\n",
    ")\n",
    "\n",
    "fig = go.Figure(data = [trace2], layout=layout)\n",
    "fig.layout.title = \"CDF2\"\n",
    "py.offline.iplot(fig)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
